import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Read in data
# NOTE(review): this is an absolute path into one user's Downloads folder, so the
# notebook only runs on that machine unless the path is edited.
dat = pd.read_csv("/Users/anandigupta/Downloads/nyc_airbnb_listings.csv")
# View data (head() shows the first 5 rows)
dat.head()
| listing_id | host_id | host_response_rate | host_acceptance_rate | host_is_superhost | host_listings_count | host_total_listings_count | host_has_profile_pic | host_identity_verified | neighbourhood_group | ... | number_of_reviews_ltm | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2060 | 2259 | 0.22 | 0.50 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | Manhattan | ... | 0 | 80.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0.01 |
| 1 | 2595 | 2845 | 0.87 | 0.38 | 0.0 | 6.0 | 6.0 | 1.0 | 1.0 | Manhattan | ... | 5 | 94.0 | 9.0 | 9.0 | 10.0 | 10.0 | 10.0 | 9.0 | 0 | 0.38 |
| 2 | 3831 | 4869 | 0.83 | 0.96 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | Brooklyn | ... | 69 | 90.0 | 9.0 | 9.0 | 10.0 | 10.0 | 10.0 | 8.0 | 0 | 4.71 |
| 3 | 5099 | 7322 | NaN | 0.71 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | Manhattan | ... | 8 | 90.0 | 10.0 | 9.0 | 10.0 | 10.0 | 10.0 | 9.0 | 0 | 0.59 |
| 4 | 5114 | 7345 | 0.50 | NaN | 0.0 | 3.0 | 3.0 | 1.0 | 0.0 | Manhattan | ... | 0 | 94.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 0.56 |
5 rows × 36 columns
# Check data types
dat.dtypes
listing_id int64 host_id int64 host_response_rate float64 host_acceptance_rate float64 host_is_superhost float64 host_listings_count float64 host_total_listings_count float64 host_has_profile_pic float64 host_identity_verified float64 neighbourhood_group object room_type object accommodates int64 bathrooms float64 bedrooms float64 beds float64 price int64 security_deposit float64 cleaning_fee float64 guests_included int64 extra_people int64 has_availability int64 availability_30 int64 availability_60 int64 availability_90 int64 availability_365 int64 number_of_reviews int64 number_of_reviews_ltm int64 review_scores_rating float64 review_scores_accuracy float64 review_scores_cleanliness float64 review_scores_checkin float64 review_scores_communication float64 review_scores_location float64 review_scores_value float64 instant_bookable int64 reviews_per_month float64 dtype: object
#examine what the object type values look like as they can represent mixed types
dat.neighbourhood_group.unique()
array(['Manhattan', 'Brooklyn', 'Queens', 'Staten Island', 'Bronx'],
dtype=object)
dat.room_type.unique()
array(['Private room', 'Entire home/apt', 'Shared room', 'Hotel room'],
dtype=object)
The dataframe contains variables of three types: integers, floats, and objects. ID numbers, quantitative metrics (such as listing count, number of bedrooms or bathrooms, and host acceptance rate), and dummy variables (indicating features such as whether the listing is instantly bookable or has availability) are stored as floats or integers. Room type and neighbourhood group are categorical variables stored as objects (each category is a string).
#Print dimensions
dat.shape
(50796, 36)
The dataset has 50796 observations and 36 variables.
#sum number of nas in each column
dat.isna().sum()
listing_id 0 host_id 0 host_response_rate 19006 host_acceptance_rate 14015 host_is_superhost 5 host_listings_count 5 host_total_listings_count 5 host_has_profile_pic 5 host_identity_verified 5 neighbourhood_group 0 room_type 0 accommodates 0 bathrooms 54 bedrooms 77 beds 482 price 0 security_deposit 17325 cleaning_fee 10528 guests_included 0 extra_people 0 has_availability 0 availability_30 0 availability_60 0 availability_90 0 availability_365 0 number_of_reviews 0 number_of_reviews_ltm 0 review_scores_rating 11431 review_scores_accuracy 11466 review_scores_cleanliness 11452 review_scores_checkin 11479 review_scores_communication 11463 review_scores_location 11482 review_scores_value 11482 instant_bookable 0 reviews_per_month 10453 dtype: int64
The dataset is relatively complete for most variables, but attributes such as the host response and acceptance rates, the security deposit and cleaning fee, and the review scores are more sparsely populated (about 10,000 - 20,000 observations missing for each).
#create correlation matrix
# Restrict to numeric columns first: neighbourhood_group and room_type hold
# strings, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
# Older pandas versions silently dropped those columns, so the resulting
# matrix is the same there.
corrM = dat.select_dtypes(include="number").corr()
corrM
| listing_id | host_id | host_response_rate | host_acceptance_rate | host_is_superhost | host_listings_count | host_total_listings_count | host_has_profile_pic | host_identity_verified | accommodates | ... | number_of_reviews_ltm | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| listing_id | 1.000000 | 0.602992 | 0.001262 | 0.173659 | -0.018315 | 0.095601 | 0.095601 | -0.008842 | -0.378890 | 0.061642 | ... | -0.021945 | -0.020767 | -0.051783 | 0.013608 | -0.072445 | -0.084298 | 0.036115 | -0.015834 | 0.301018 | 0.239285 |
| host_id | 0.602992 | 1.000000 | -0.006617 | 0.204172 | -0.024815 | 0.005862 | 0.005862 | -0.032739 | -0.477734 | 0.080800 | ... | 0.055112 | -0.112733 | -0.124457 | -0.031832 | -0.117417 | -0.137721 | -0.036859 | -0.097275 | 0.285985 | 0.265653 |
| host_response_rate | 0.001262 | -0.006617 | 1.000000 | 0.351554 | 0.184319 | 0.056084 | 0.056084 | 0.028286 | -0.007838 | 0.045398 | ... | 0.148508 | 0.085358 | 0.088140 | 0.100042 | 0.079643 | 0.091062 | 0.041354 | 0.073904 | 0.066077 | 0.157114 |
| host_acceptance_rate | 0.173659 | 0.204172 | 0.351554 | 1.000000 | 0.212421 | 0.031606 | 0.031606 | 0.027714 | -0.130496 | 0.099578 | ... | 0.290707 | -0.024459 | -0.001184 | 0.037585 | -0.002236 | -0.017243 | -0.008994 | -0.002376 | 0.369932 | 0.341515 |
| host_is_superhost | -0.018315 | -0.024815 | 0.184319 | 0.212421 | 1.000000 | 0.041023 | 0.041023 | 0.010434 | 0.031779 | 0.068562 | ... | 0.390701 | 0.195336 | 0.187935 | 0.229011 | 0.148060 | 0.150062 | 0.097842 | 0.185606 | 0.011973 | 0.309221 |
| host_listings_count | 0.095601 | 0.005862 | 0.056084 | 0.031606 | 0.041023 | 1.000000 | 1.000000 | 0.006609 | -0.046198 | -0.016406 | ... | -0.064539 | -0.020771 | -0.024317 | 0.003697 | -0.024194 | -0.056838 | 0.003392 | -0.057233 | 0.060790 | -0.043061 |
| host_total_listings_count | 0.095601 | 0.005862 | 0.056084 | 0.031606 | 0.041023 | 1.000000 | 1.000000 | 0.006609 | -0.046198 | -0.016406 | ... | -0.064539 | -0.020771 | -0.024317 | 0.003697 | -0.024194 | -0.056838 | 0.003392 | -0.057233 | 0.060790 | -0.043061 |
| host_has_profile_pic | -0.008842 | -0.032739 | 0.028286 | 0.027714 | 0.010434 | 0.006609 | 0.006609 | 1.000000 | 0.038597 | -0.006434 | ... | 0.011128 | 0.026317 | 0.016763 | 0.019061 | 0.021232 | 0.021010 | 0.004430 | 0.012378 | 0.001804 | 0.011665 |
| host_identity_verified | -0.378890 | -0.477734 | -0.007838 | -0.130496 | 0.031779 | -0.046198 | -0.046198 | 0.038597 | 1.000000 | -0.020823 | ... | -0.042429 | 0.072695 | 0.078814 | 0.032247 | 0.074222 | 0.092728 | 0.028414 | 0.065824 | -0.200653 | -0.157247 |
| accommodates | 0.061642 | 0.080800 | 0.045398 | 0.099578 | 0.068562 | -0.016406 | -0.016406 | -0.006434 | -0.020823 | 1.000000 | ... | 0.104212 | -0.000080 | -0.010617 | 0.039792 | 0.007854 | -0.005861 | -0.010258 | -0.025103 | 0.031984 | 0.122069 |
| bathrooms | 0.056949 | 0.057874 | 0.019772 | 0.025526 | 0.003550 | -0.001744 | -0.001744 | 0.004824 | -0.042181 | 0.354561 | ... | -0.016568 | -0.005595 | -0.019080 | -0.014408 | -0.007311 | -0.032106 | -0.020089 | -0.010030 | 0.021969 | -0.000289 |
| bedrooms | 0.037970 | 0.052067 | 0.024535 | 0.045532 | 0.036925 | -0.043568 | -0.043568 | -0.004448 | -0.016906 | 0.653509 | ... | 0.037931 | 0.013147 | -0.001721 | 0.024562 | 0.013985 | 0.004829 | -0.022312 | 0.008254 | 0.000706 | 0.055252 |
| beds | 0.047793 | 0.086926 | 0.052383 | 0.085553 | 0.065074 | -0.029231 | -0.029231 | -0.005455 | -0.036156 | 0.740344 | ... | 0.091224 | -0.007402 | -0.015761 | 0.022708 | 0.004943 | -0.010170 | -0.016775 | -0.019348 | 0.021680 | 0.104406 |
| price | 0.045653 | 0.069123 | -0.053930 | 0.021025 | -0.018002 | 0.025507 | 0.025507 | 0.001499 | -0.026756 | 0.178947 | ... | -0.031160 | -0.039119 | -0.053565 | 0.008668 | -0.042648 | -0.051825 | 0.039021 | -0.061547 | 0.036686 | 0.018566 |
| security_deposit | -0.118435 | -0.098430 | -0.057194 | -0.116541 | -0.011260 | 0.058874 | 0.058874 | -0.013096 | 0.068275 | 0.142695 | ... | -0.105009 | 0.015655 | -0.002014 | 0.028637 | -0.005121 | -0.002638 | 0.019689 | -0.039846 | -0.115657 | -0.131281 |
| cleaning_fee | 0.049106 | -0.000953 | 0.028696 | -0.016993 | 0.028845 | 0.453531 | 0.453531 | -0.012065 | -0.026663 | 0.399469 | ... | -0.103222 | 0.036119 | 0.001527 | 0.062655 | -0.001827 | -0.001112 | 0.072074 | -0.049634 | -0.013670 | -0.098864 |
| guests_included | 0.004008 | 0.044929 | 0.061557 | 0.097486 | 0.119141 | -0.026755 | -0.026755 | 0.002060 | 0.001040 | 0.567537 | ... | 0.145043 | 0.020984 | 0.021922 | 0.054886 | 0.033460 | 0.022656 | -0.011507 | 0.006743 | 0.027237 | 0.127048 |
| extra_people | -0.041224 | -0.006428 | 0.025930 | 0.016112 | 0.103724 | -0.068967 | -0.068967 | 0.006684 | 0.016055 | 0.104944 | ... | 0.111235 | 0.033547 | 0.026619 | 0.056248 | 0.027462 | 0.025534 | 0.025424 | 0.015567 | -0.010170 | 0.068929 |
| has_availability | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| availability_30 | 0.230724 | 0.290960 | -0.018577 | 0.061737 | 0.153433 | 0.004393 | 0.004393 | -0.026163 | -0.170484 | 0.125943 | ... | 0.267006 | -0.078834 | -0.080155 | 0.020643 | -0.051604 | -0.078296 | -0.031946 | -0.102707 | 0.114008 | 0.319761 |
| availability_60 | 0.244280 | 0.306300 | -0.032390 | 0.041812 | 0.137478 | 0.039899 | 0.039899 | -0.024280 | -0.183010 | 0.114271 | ... | 0.225766 | -0.090592 | -0.094618 | 0.012235 | -0.062996 | -0.091817 | -0.045797 | -0.119764 | 0.109169 | 0.283795 |
| availability_90 | 0.250602 | 0.311289 | -0.042713 | 0.029489 | 0.135220 | 0.059292 | 0.059292 | -0.020580 | -0.187162 | 0.107593 | ... | 0.213441 | -0.092727 | -0.098288 | 0.010361 | -0.065559 | -0.095661 | -0.050263 | -0.123637 | 0.105531 | 0.272230 |
| availability_365 | 0.137048 | 0.239372 | -0.009247 | 0.036696 | 0.153689 | 0.146280 | 0.146280 | -0.011574 | -0.137903 | 0.157548 | ... | 0.191389 | -0.091103 | -0.097835 | 0.004917 | -0.061368 | -0.099078 | -0.053056 | -0.136669 | 0.096242 | 0.217776 |
| number_of_reviews | -0.306478 | -0.136498 | 0.110972 | 0.189414 | 0.325899 | -0.063099 | -0.063099 | 0.011068 | 0.103037 | 0.070227 | ... | 0.759504 | 0.042106 | 0.081859 | 0.083339 | 0.086608 | 0.072147 | 0.034764 | 0.060040 | 0.029517 | 0.615659 |
| number_of_reviews_ltm | -0.021945 | 0.055112 | 0.148508 | 0.290707 | 0.390701 | -0.064539 | -0.064539 | 0.011128 | -0.042429 | 0.104212 | ... | 1.000000 | 0.049662 | 0.083217 | 0.103912 | 0.078585 | 0.064344 | 0.059858 | 0.066158 | 0.113588 | 0.857747 |
| review_scores_rating | -0.020767 | -0.112733 | 0.085358 | -0.024459 | 0.195336 | -0.020771 | -0.020771 | 0.026317 | 0.072695 | -0.000080 | ... | 0.049662 | 1.000000 | 0.760376 | 0.715825 | 0.612325 | 0.675841 | 0.471095 | 0.769935 | -0.075904 | 0.044755 |
| review_scores_accuracy | -0.051783 | -0.124457 | 0.088140 | -0.001184 | 0.187935 | -0.024317 | -0.024317 | 0.016763 | 0.078814 | -0.010617 | ... | 0.083217 | 0.760376 | 1.000000 | 0.608828 | 0.593161 | 0.619566 | 0.439064 | 0.696670 | -0.065149 | 0.066410 |
| review_scores_cleanliness | 0.013608 | -0.031832 | 0.100042 | 0.037585 | 0.229011 | 0.003697 | 0.003697 | 0.019061 | 0.032247 | 0.039792 | ... | 0.103912 | 0.715825 | 0.608828 | 1.000000 | 0.450519 | 0.469442 | 0.355199 | 0.615790 | -0.035976 | 0.111100 |
| review_scores_checkin | -0.072445 | -0.117417 | 0.079643 | -0.002236 | 0.148060 | -0.024194 | -0.024194 | 0.021232 | 0.074222 | 0.007854 | ... | 0.078585 | 0.612325 | 0.593161 | 0.450519 | 1.000000 | 0.669064 | 0.394343 | 0.541789 | -0.063139 | 0.060246 |
| review_scores_communication | -0.084298 | -0.137721 | 0.091062 | -0.017243 | 0.150062 | -0.056838 | -0.056838 | 0.021010 | 0.092728 | -0.005861 | ... | 0.064344 | 0.675841 | 0.619566 | 0.469442 | 0.669064 | 1.000000 | 0.415460 | 0.587376 | -0.080219 | 0.044498 |
| review_scores_location | 0.036115 | -0.036859 | 0.041354 | -0.008994 | 0.097842 | 0.003392 | 0.003392 | 0.004430 | 0.028414 | -0.010258 | ... | 0.059858 | 0.471095 | 0.439064 | 0.355199 | 0.394343 | 0.415460 | 1.000000 | 0.470615 | -0.033156 | 0.058734 |
| review_scores_value | -0.015834 | -0.097275 | 0.073904 | -0.002376 | 0.185606 | -0.057233 | -0.057233 | 0.012378 | 0.065824 | -0.025103 | ... | 0.066158 | 0.769935 | 0.696670 | 0.615790 | 0.541789 | 0.587376 | 0.470615 | 1.000000 | -0.057230 | 0.067993 |
| instant_bookable | 0.301018 | 0.285985 | 0.066077 | 0.369932 | 0.011973 | 0.060790 | 0.060790 | 0.001804 | -0.200653 | 0.031984 | ... | 0.113588 | -0.075904 | -0.065149 | -0.035976 | -0.063139 | -0.080219 | -0.033156 | -0.057230 | 1.000000 | 0.196509 |
| reviews_per_month | 0.239285 | 0.265653 | 0.157114 | 0.341515 | 0.309221 | -0.043061 | -0.043061 | 0.011665 | -0.157247 | 0.122069 | ... | 0.857747 | 0.044755 | 0.066410 | 0.111100 | 0.060246 | 0.044498 | 0.058734 | 0.067993 | 0.196509 | 1.000000 |
34 rows × 34 columns
#Plot correlation heatmap
# Hide the upper triangle (diagonal included) so each pair of attributes is
# drawn only once. The mask is built from corrM, so it always has the same
# shape as the matrix being plotted and the correlations are not recomputed;
# a boolean dtype makes it an explicit "hide this cell" array.
mask = np.triu(np.ones_like(corrM, dtype=bool))
plt.figure(figsize=(15, 15))
sns.heatmap(corrM, center=0, linewidths=.5, cmap="magma", annot=False, mask=mask)
plt.title("Correlation between Attributes of Airbnb Listings", fontsize=15)
plt.show()
It seems that there is some level of multicollinearity between the variables, but correlation between the various attributes is generally fairly modest (between -0.5 and 0.5 for most attributes). However, there are a few attributes such as availability_30 and availability_60, beds and bedrooms, host listings count and host total listings count, and reviews per month and number of reviews ltm that are either very highly correlated or perfectly correlated, indicating that multicollinearity is an issue for some variables (note, this list is not exhaustive).
# Remove every row that has at least one missing value
dat = dat.dropna(axis=0, how="any")
# Confirm how many rows (and columns) remain after the drop
dat.shape
(20115, 36)
Note: dropping rows with missing values reduces the number of observations substantially, from 50,796 to 20,115 (about 60% of the rows are lost).
#Convert categorical variables into dummies
# The two string columns are replaced by one indicator column per category
# level; no level is dropped, so all 5 neighbourhood groups and all 4 room
# types get their own column.
dat = pd.get_dummies(dat, columns=['neighbourhood_group', 'room_type'])
#check dummy conversion worked
dat.head()
| listing_id | host_id | host_response_rate | host_acceptance_rate | host_is_superhost | host_listings_count | host_total_listings_count | host_has_profile_pic | host_identity_verified | accommodates | ... | reviews_per_month | neighbourhood_group_Bronx | neighbourhood_group_Brooklyn | neighbourhood_group_Manhattan | neighbourhood_group_Queens | neighbourhood_group_Staten Island | room_type_Entire home/apt | room_type_Hotel room | room_type_Private room | room_type_Shared room | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2595 | 2845 | 0.87 | 0.38 | 0.0 | 6.0 | 6.0 | 1.0 | 1.0 | 2 | ... | 0.38 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 8 | 5238 | 7549 | 1.00 | 0.26 | 1.0 | 4.0 | 4.0 | 1.0 | 1.0 | 3 | ... | 1.26 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 9 | 5441 | 7989 | 1.00 | 0.56 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2 | ... | 1.59 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 10 | 5552 | 8380 | 1.00 | 0.20 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2 | ... | 0.51 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 11 | 5803 | 9744 | 1.00 | 0.99 | 1.0 | 3.0 | 3.0 | 1.0 | 1.0 | 2 | ... | 1.35 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 43 columns
# Target: price, kept as a single-column DataFrame
y = dat[["price"]]
# Features: every remaining column
X = dat.drop(columns="price")
# Hold out 30% of the rows as a test set; the seed is fixed for reproducibility
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)
# For running models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# For evaluating our model's performance
import sklearn.metrics as m
def r2_scores(ytrue, ypredicted, p):
    """
    Compute R squared and adjusted R squared for a set of predictions.

    Arguments
    ---------
    ytrue: vector
        true values of outcome variable
    ypredicted: vector
        predicted values of outcome variable
    p: int
        number of predictor variables

    return
    ------
    A tuple (R squared, adjusted R squared). The adjusted value is NaN when
    there are too few observations for the adjustment to be defined
    (n - p - 1 <= 0, where n is the number of observations).
    """
    # Use sklearn for r squared
    r2 = m.r2_score(ytrue, ypredicted)
    # Calculate number of observations
    n = len(ytrue)
    # Residual degrees of freedom; the adjustment divides by this, so it must
    # be positive for adjusted R squared to be meaningful
    dof = n - p - 1
    if dof <= 0:
        return (r2, float("nan"))
    # Apply formula to convert R squared to adjusted R squared (p is no. of features)
    # Adjusted R2 formula here: http://net-informations.com/ds/psa/adjusted.htm
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / dof)
    return (r2, adjusted_r2)
def reg_model(model, features, target, alpha = 1.0):
    """
    Fit a linear model on a 70/30 train/test split and summarise the fit.

    Arguments
    ---------
    model: str
        "LinearRegression" or "Ridge"; any other value fits a Lasso
    features: DataFrame
        predictor variables, one column per feature
    target: DataFrame or vector
        a single outcome variable
    alpha: float
        penalty strength passed to Ridge / Lasso (not used by LinearRegression)

    return
    ------
    A tuple of
      - a DataFrame with columns "feature" and "coef" (one row per predictor)
      - the (R squared, adjusted R squared) tuple from r2_scores, computed on
        the held-out test split
    """
    # Instantiate model
    if model == "LinearRegression":
        mod = LinearRegression()
    elif model == "Ridge":
        mod = Ridge(alpha=alpha)
    else:
        mod = Lasso(alpha=alpha)

    # Split the data that was passed in (not the notebook-level X and y), so
    # the function works for any feature/target pair. Fixed seed keeps the
    # split identical across models so their scores are comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=0
    )

    # Fit model
    mod.fit(X_train, y_train)

    # Score on the held-out rows; p is the total number of predictor columns
    y_test_pred = mod.predict(X_test)
    r2_scores_output = r2_scores(y_test, y_test_pred, X_test.shape[1])

    # Depending on the estimator and on whether the target is a one-column
    # DataFrame, coef_ is either a flat vector or a single wide row.
    # Flattening handles both layouts with one code path.
    mod_df = pd.DataFrame(
        {"feature": list(features.columns), "coef": np.ravel(mod.coef_)}
    )

    # Output coefficient df and r2 / adjusted r2 tuple
    return mod_df, r2_scores_output
#call linear regression model
lr = reg_model("LinearRegression", X, y)
#pull coefficient df (1st item in output)
lr_df = lr[0]
lr_df
| feature | coef | |
|---|---|---|
| 0 | listing_id | 7.263481e-08 |
| 1 | host_id | 1.103437e-07 |
| 2 | host_response_rate | -9.207092e+01 |
| 3 | host_acceptance_rate | 2.066896e+01 |
| 4 | host_is_superhost | 4.663714e+00 |
| 5 | host_listings_count | -2.609260e-02 |
| 6 | host_total_listings_count | -2.609259e-02 |
| 7 | host_has_profile_pic | 5.911220e+01 |
| 8 | host_identity_verified | 1.015928e+01 |
| 9 | accommodates | 2.066332e+01 |
| 10 | bathrooms | 6.385512e+01 |
| 11 | bedrooms | 3.093114e+01 |
| 12 | beds | -1.109014e+01 |
| 13 | security_deposit | 3.740903e-02 |
| 14 | cleaning_fee | 3.393742e-01 |
| 15 | guests_included | -1.335756e+00 |
| 16 | extra_people | -2.998091e-02 |
| 17 | has_availability | 2.320526e-10 |
| 18 | availability_30 | 1.876830e-01 |
| 19 | availability_60 | 5.893390e-01 |
| 20 | availability_90 | -9.355873e-02 |
| 21 | availability_365 | 2.689430e-02 |
| 22 | number_of_reviews | -1.232301e-02 |
| 23 | number_of_reviews_ltm | -7.330347e-01 |
| 24 | review_scores_rating | -1.079706e+00 |
| 25 | review_scores_accuracy | -2.214675e+00 |
| 26 | review_scores_cleanliness | 3.040266e+01 |
| 27 | review_scores_checkin | -9.029020e+00 |
| 28 | review_scores_communication | -7.973170e+00 |
| 29 | review_scores_location | 3.987449e+01 |
| 30 | review_scores_value | -2.100572e+01 |
| 31 | instant_bookable | 1.173095e+01 |
| 32 | reviews_per_month | 3.793778e+00 |
| 33 | neighbourhood_group_Bronx | -1.465481e+01 |
| 34 | neighbourhood_group_Brooklyn | 5.221585e+00 |
| 35 | neighbourhood_group_Manhattan | 7.090926e+01 |
| 36 | neighbourhood_group_Queens | -1.680904e+01 |
| 37 | neighbourhood_group_Staten Island | -4.466699e+01 |
| 38 | room_type_Entire home/apt | 8.940447e+00 |
| 39 | room_type_Hotel room | 7.070027e+01 |
| 40 | room_type_Private room | -1.400749e+01 |
| 41 | room_type_Shared room | -6.563323e+01 |
#plot coefficients
fig = px.bar(lr_df, x='feature', y='coef')
fig.show()
#call lasso model with alpha = 0.5
lasso_1 = reg_model("Lasso", X, y, 0.5)
#pull coefficient df (1st item in output)
lasso_1_df = lasso_1[0]
lasso_1_df
| feature | coef | |
|---|---|---|
| 0 | listing_id | 1.445653e-07 |
| 1 | host_id | 1.126578e-07 |
| 2 | host_response_rate | -5.896406e+01 |
| 3 | host_acceptance_rate | 0.000000e+00 |
| 4 | host_is_superhost | 1.339666e+00 |
| 5 | host_listings_count | -5.187685e-02 |
| 6 | host_total_listings_count | -3.598211e-17 |
| 7 | host_has_profile_pic | 0.000000e+00 |
| 8 | host_identity_verified | 7.855368e+00 |
| 9 | accommodates | 2.090092e+01 |
| 10 | bathrooms | 5.987187e+01 |
| 11 | bedrooms | 2.927051e+01 |
| 12 | beds | -1.034718e+01 |
| 13 | security_deposit | 3.842311e-02 |
| 14 | cleaning_fee | 3.614561e-01 |
| 15 | guests_included | -3.587819e-01 |
| 16 | extra_people | -4.600373e-02 |
| 17 | has_availability | 0.000000e+00 |
| 18 | availability_30 | 2.455018e-01 |
| 19 | availability_60 | 5.706176e-01 |
| 20 | availability_90 | -1.251273e-01 |
| 21 | availability_365 | 3.125181e-02 |
| 22 | number_of_reviews | -6.197030e-03 |
| 23 | number_of_reviews_ltm | -6.155097e-01 |
| 24 | review_scores_rating | -1.223783e+00 |
| 25 | review_scores_accuracy | -1.199153e+00 |
| 26 | review_scores_cleanliness | 2.951362e+01 |
| 27 | review_scores_checkin | -8.300890e+00 |
| 28 | review_scores_communication | -7.374908e+00 |
| 29 | review_scores_location | 3.876521e+01 |
| 30 | review_scores_value | -1.995703e+01 |
| 31 | instant_bookable | 1.126203e+01 |
| 32 | reviews_per_month | 3.041574e+00 |
| 33 | neighbourhood_group_Bronx | -0.000000e+00 |
| 34 | neighbourhood_group_Brooklyn | 1.416776e+01 |
| 35 | neighbourhood_group_Manhattan | 8.065991e+01 |
| 36 | neighbourhood_group_Queens | -3.365926e+00 |
| 37 | neighbourhood_group_Staten Island | -0.000000e+00 |
| 38 | room_type_Entire home/apt | 1.822330e+01 |
| 39 | room_type_Hotel room | 3.891499e+01 |
| 40 | room_type_Private room | -2.294152e-03 |
| 41 | room_type_Shared room | -2.908694e+01 |
#plot coefficients
fig = px.bar(lasso_1_df, x='feature', y='coef')
fig.show()
# Count the coefficients that are exactly 0. The comparison is exact, so a
# coefficient that is only approximately zero (e.g. -3.6e-17) is not counted.
zero_coef = lasso_1_df.loc[lasso_1_df['coef'].eq(0)]
print(zero_coef.shape[0])
5
#call lasso model with alpha = 5.0
lasso_2 = reg_model("Lasso", X, y, 5.0)
#pull coefficient df (1st item in output)
lasso_2_df = lasso_2[0]
lasso_2_df
| feature | coef | |
|---|---|---|
| 0 | listing_id | 3.221587e-07 |
| 1 | host_id | 1.189364e-07 |
| 2 | host_response_rate | -0.000000e+00 |
| 3 | host_acceptance_rate | 0.000000e+00 |
| 4 | host_is_superhost | -0.000000e+00 |
| 5 | host_listings_count | -5.084476e-02 |
| 6 | host_total_listings_count | -0.000000e+00 |
| 7 | host_has_profile_pic | 0.000000e+00 |
| 8 | host_identity_verified | 0.000000e+00 |
| 9 | accommodates | 2.069361e+01 |
| 10 | bathrooms | 3.102519e+01 |
| 11 | bedrooms | 1.588521e+01 |
| 12 | beds | -0.000000e+00 |
| 13 | security_deposit | 4.315832e-02 |
| 14 | cleaning_fee | 5.356044e-01 |
| 15 | guests_included | 0.000000e+00 |
| 16 | extra_people | -5.533472e-02 |
| 17 | has_availability | 0.000000e+00 |
| 18 | availability_30 | 5.313326e-01 |
| 19 | availability_60 | 3.198996e-01 |
| 20 | availability_90 | -9.510029e-02 |
| 21 | availability_365 | 3.733521e-02 |
| 22 | number_of_reviews | -0.000000e+00 |
| 23 | number_of_reviews_ltm | -4.277014e-01 |
| 24 | review_scores_rating | -2.008845e+00 |
| 25 | review_scores_accuracy | -0.000000e+00 |
| 26 | review_scores_cleanliness | 1.607250e+01 |
| 27 | review_scores_checkin | -0.000000e+00 |
| 28 | review_scores_communication | -0.000000e+00 |
| 29 | review_scores_location | 2.403447e+01 |
| 30 | review_scores_value | -3.785681e+00 |
| 31 | instant_bookable | 0.000000e+00 |
| 32 | reviews_per_month | 0.000000e+00 |
| 33 | neighbourhood_group_Bronx | -0.000000e+00 |
| 34 | neighbourhood_group_Brooklyn | -0.000000e+00 |
| 35 | neighbourhood_group_Manhattan | 5.543472e+01 |
| 36 | neighbourhood_group_Queens | -0.000000e+00 |
| 37 | neighbourhood_group_Staten Island | -0.000000e+00 |
| 38 | room_type_Entire home/apt | 0.000000e+00 |
| 39 | room_type_Hotel room | 0.000000e+00 |
| 40 | room_type_Private room | -0.000000e+00 |
| 41 | room_type_Shared room | -0.000000e+00 |
#plot coefficients
fig = px.bar(lasso_2_df, x='feature', y='coef')
fig.show()
# Count the coefficients that are exactly 0 (exact comparison, no tolerance)
zero_coef = lasso_2_df.loc[lasso_2_df['coef'].eq(0)]
print(zero_coef.shape[0])
23
As the results above show, as the penalty for the LASSO regression increases, the number of coefficients shrunk to 0 rises from 5 (alpha = 0.5) to 23 (alpha = 5.0).
#Note to TAs: I already apply the r2_score function I defined in question 3
#to each model within the reg_model function used in question 4
#so I don't repeat those steps here
#call r2 tuple (2nd item in model output for linear regression)
r2_score_lr = lr[1]
r2_score_lr
(0.14603542041754225, 0.14004968738308587)
#call r2 tuple (2nd item in model output for lasso with 0.5 alpha)
r2_score_lasso_1 = lasso_1[1]
r2_score_lasso_1
(0.14648753178625928, 0.14050496775672372)
#call r2 tuple (2nd item in model output for lasso with 5.0 alpha)
r2_score_lasso_2 = lasso_2[1]
r2_score_lasso_2
(0.1391478636079413, 0.1331138533061278)
We see that the LASSO model with alpha = 0.5 performs very similarly to the linear regression model (it does slightly better on both the R squared and adjusted R squared metrics). However, as alpha increases to 5.0, the LASSO regression does worse on both the R squared and adjusted R squared metrics. This is likely because there are correlated features in the regression: LASSO tends to keep one feature from a group of correlated features (an essentially arbitrary choice) and set the remaining correlated features to 0, leading to loss of information and lower accuracy. The larger penalty also shrinks the coefficients that remain, which adds bias to the predictions.
# Left-join the LASSO (alpha = 5.0) coefficients onto the linear regression
# coefficients by feature name. The merge suffixes the two clashing "coef"
# columns with _x / _y, so they are renamed to descriptive labels.
coeff_df = (
    pd.merge(lr_df, lasso_2_df, how='left', on='feature')
    .rename(columns={"coef_x": "Linear Regression Coefficients", "coef_y": "LASSO (5.0) Coefficients"})
)
# Show the coefficients rounded to 6 decimal places for readability
coeff_df.round(6)
| feature | Linear Regression Coefficients | LASSO (5.0) Coefficients | |
|---|---|---|---|
| 0 | listing_id | 0.000000 | 0.000000 |
| 1 | host_id | 0.000000 | 0.000000 |
| 2 | host_response_rate | -92.070916 | -0.000000 |
| 3 | host_acceptance_rate | 20.668963 | 0.000000 |
| 4 | host_is_superhost | 4.663714 | -0.000000 |
| 5 | host_listings_count | -0.026093 | -0.050845 |
| 6 | host_total_listings_count | -0.026093 | -0.000000 |
| 7 | host_has_profile_pic | 59.112200 | 0.000000 |
| 8 | host_identity_verified | 10.159283 | 0.000000 |
| 9 | accommodates | 20.663315 | 20.693611 |
| 10 | bathrooms | 63.855124 | 31.025195 |
| 11 | bedrooms | 30.931136 | 15.885209 |
| 12 | beds | -11.090144 | -0.000000 |
| 13 | security_deposit | 0.037409 | 0.043158 |
| 14 | cleaning_fee | 0.339374 | 0.535604 |
| 15 | guests_included | -1.335756 | 0.000000 |
| 16 | extra_people | -0.029981 | -0.055335 |
| 17 | has_availability | 0.000000 | 0.000000 |
| 18 | availability_30 | 0.187683 | 0.531333 |
| 19 | availability_60 | 0.589339 | 0.319900 |
| 20 | availability_90 | -0.093559 | -0.095100 |
| 21 | availability_365 | 0.026894 | 0.037335 |
| 22 | number_of_reviews | -0.012323 | -0.000000 |
| 23 | number_of_reviews_ltm | -0.733035 | -0.427701 |
| 24 | review_scores_rating | -1.079706 | -2.008845 |
| 25 | review_scores_accuracy | -2.214675 | -0.000000 |
| 26 | review_scores_cleanliness | 30.402660 | 16.072501 |
| 27 | review_scores_checkin | -9.029020 | -0.000000 |
| 28 | review_scores_communication | -7.973170 | -0.000000 |
| 29 | review_scores_location | 39.874488 | 24.034473 |
| 30 | review_scores_value | -21.005724 | -3.785681 |
| 31 | instant_bookable | 11.730950 | 0.000000 |
| 32 | reviews_per_month | 3.793778 | 0.000000 |
| 33 | neighbourhood_group_Bronx | -14.654808 | -0.000000 |
| 34 | neighbourhood_group_Brooklyn | 5.221585 | -0.000000 |
| 35 | neighbourhood_group_Manhattan | 70.909256 | 55.434723 |
| 36 | neighbourhood_group_Queens | -16.809043 | -0.000000 |
| 37 | neighbourhood_group_Staten Island | -44.666990 | -0.000000 |
| 38 | room_type_Entire home/apt | 8.940447 | 0.000000 |
| 39 | room_type_Hotel room | 70.700271 | 0.000000 |
| 40 | room_type_Private room | -14.007486 | -0.000000 |
| 41 | room_type_Shared room | -65.633233 | -0.000000 |